In this notebook, we are using the tmb_genomic.tsv file
generated from the 01-preprocess-data.Rmd script.
suppressPackageStartupMessages({
library(tidyverse)
})
# Locate the project root through its ".git" folder so that every path below
# resolves the same way no matter which working directory this is run from.
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Directories used by this analysis
scratch_dir <- file.path(root_dir, "scratch")
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
input_dir <- file.path(analysis_dir, "input")

# Input files
tmb_genomic_file <- file.path(scratch_dir, "tmb_genomic.tsv")
tumor_descriptor_color_palette_file <- file.path(
  root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv"
)

# Output directory for the plots; created on first run.
# `recursive = TRUE` also creates any missing parent directory.
plots_dir <- file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir, recursive = TRUE)
}

# Source the analysis-specific barplot script and the project figure theme.
# Paths are assembled with file.path(), like every other path in this block,
# instead of pasting hard-coded "/" separators.
source(file.path(analysis_dir, "util", "function-create-barplot-v1.R"))
source(file.path(root_dir, "figures", "scripts", "theme.R"))
# Load the TMB table
tmb_genomic_all <- readr::read_tsv(
  tmb_genomic_file,
  guess_max = 100000,
  show_col_types = FALSE
)

# Are there any participants with both WGS and WXS samples?
# Print, per participant, the ";"-joined experimental strategies of their
# (de-duplicated) rows.
tmb_genomic_all %>%
  distinct() %>%
  arrange(Kids_First_Participant_ID, experimental_strategy) %>%
  group_by(Kids_First_Participant_ID) %>%
  summarise(
    experimental_strategy_sum = str_c(experimental_strategy, collapse = ";")
  )

# Yes, there are, so let's remove the WXS samples from downstream analyses.
tmb_genomic <- tmb_genomic_all %>%
  filter(experimental_strategy != "WXS")

# Load the tumor descriptor color palette
tumor_descriptor_color_palette <- readr::read_tsv(
  tumor_descriptor_color_palette_file,
  guess_max = 100000,
  show_col_types = FALSE
)
We will explore TMB per Kids_First_Participant_ID over
time by creating stacked barplots.
# Define parameters for function:
# upper y-axis limit and the data frame to plot (all WGS samples)
ylim <- 360
tmb_df <- tmb_genomic
# Run function
# Output file, built with file.path() like the other paths in this notebook
fname <- file.path(plots_dir, "TMB-genomic.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic.pdf"
# Build the stacked barplot, then draw it into the PDF at `fname`
p <- create_stacked_barplot(
  tmb_df = tmb_df,
  ylim = ylim
)
grDevices::pdf(file = fname, width = 15, height = 6)
print(p)
# Close the PDF device so the file is written out
grDevices::dev.off()
quartz_off_screen
2
Note that low TMB is defined as ≤5 mutations/Mb, intermediate TMB as >5 and ≤20 mutations/Mb, high TMB as >20 and ≤50 mutations/Mb, and very high TMB as >50 mutations/Mb.
We notice that there are samples with high TMB (hyper-mutant samples). Next, we will exclude these samples (threshold >= 20) from downstream analysis. Care is needed when a patient has a high number of mutations at only one timepoint, because excluding that sample leaves unmatched longitudinal samples. We will also remove those patients so that we always have matched longitudinal samples.
# Filter df: remove hyper-mutant samples and keep only patients whose
# longitudinal samples are still matched afterwards.
tmb_genomic_filter <- tmb_genomic %>%
  # Keep samples below the hyper-mutant threshold. Rows with a missing `tmb`
  # are dropped too, because filter() only keeps rows where the test is TRUE.
  filter(tmb < 20) %>%
  distinct() %>%
  # Rebuild, per patient, the ";"-joined list of the remaining descriptors
  arrange(Kids_First_Participant_ID, tumor_descriptor) %>%
  group_by(Kids_First_Participant_ID) %>%
  summarise(tumor_descriptor_sum = str_c(tumor_descriptor, collapse = ";")) %>%
  # Discard patients left with a single sample
  filter(!tumor_descriptor_sum %in% c("Diagnosis", "Progressive", "Recurrence")) %>%
  # Bring back the sample-level rows. The join key includes
  # `tumor_descriptor_sum`, so a patient only matches when the rebuilt list
  # equals the value stored in `tmb_genomic`.
  # NOTE(review): presumably the stored value is the per-patient descriptor
  # list from 01-preprocess-data.Rmd -- confirm.
  left_join(tmb_genomic, by = c("Kids_First_Participant_ID", "tumor_descriptor_sum")) %>%
  # Patients without a match carry NA in every joined column; drop them
  # (and any matched row without a `tmb` value).
  drop_na(tmb) %>%
  mutate(
    # Rows with a missing `short_histology` fall through to
    # "Other cancer group" instead of forming a separate "Other" group.
    cancer_group_sum = case_when(
      short_histology == "HGAT" ~ "High-grade glioma",
      short_histology == "LGAT" ~ "Low-grade glioma",
      TRUE ~ "Other cancer group"
    ),
    patient_id = paste(short_histology, Kids_First_Participant_ID, sep = "_"),
    patient_bs_id = paste(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, sep = "_")
  )
# Define parameters for function:
# upper y-axis limit and the data frame to plot (hyper-mutants removed)
ylim <- 12.5
tmb_df <- tmb_genomic_filter
# Run function
# Output file, built with file.path() like the other paths in this notebook
fname <- file.path(plots_dir, "TMB-genomic-no-hypermutants.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-no-hypermutants.pdf"
# Build the stacked barplot, then draw it into the PDF at `fname`
p <- create_stacked_barplot(
  tmb_df = tmb_df,
  ylim = ylim
)
grDevices::pdf(file = fname, width = 15, height = 8)
print(p)
# Close the PDF device so the file is written out
grDevices::dev.off()
quartz_off_screen
2
We will explore TMB per cancer group over time by creating stacked barplots. We will plot the cancer groups with the highest number of samples (high- and low-grade gliomas) versus all other cancer groups.
# Distinct cancer group labels, in order of first appearance in the table
cancer_groups <- tmb_genomic_filter$cancer_group_sum %>%
  as.character() %>%
  unique()
print(cancer_groups)
[1] "Other cancer group" "Low-grade glioma" "High-grade glioma"
# Upper y-axis limit for each cancer group's plot. The limits are keyed by
# group name rather than by loop position: the order of `cancer_groups`
# follows the row order of `tmb_genomic_filter`, so a positional lookup would
# silently give a group another group's limit if that order ever changed.
ylim_by_group <- c(
  "Other cancer group" = 8,
  "Low-grade glioma" = 4.5,
  "High-grade glioma" = 12.5
)
# Limit for any group not listed above
default_ylim <- 12.5

# Create one stacked barplot PDF per cancer group
for (i in seq_along(cancer_groups)) {
  print(i)
  print(cancer_groups[i])

  # Samples belonging to the current cancer group
  tmb_genomic_filter_sub <- tmb_genomic_filter %>%
    filter(cancer_group_sum == cancer_groups[i])

  # Define parameters for function
  if (cancer_groups[i] %in% names(ylim_by_group)) {
    ylim <- ylim_by_group[[cancer_groups[i]]]
  } else {
    ylim <- default_ylim
  }

  # Run function
  fname <- file.path(plots_dir, paste0("TMB-genomic-", cancer_groups[i], ".pdf"))
  print(fname)
  p <- create_stacked_barplot_cancer_group_sum(
    tmb_df = tmb_genomic_filter_sub,
    ylim = ylim,
    ct_id = cancer_groups[i]
  )
  pdf(file = fname, width = 12, height = 8)
  print(p)
  dev.off()
}
[1] 1
[1] "Other cancer group"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-Other cancer group.pdf"
[1] 2
[1] "Low-grade glioma"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-Low-grade glioma.pdf"
[1] 3
[1] "High-grade glioma"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-High-grade glioma.pdf"
Here, we want to explore the number of mutations
(mutation_count column) per timepoint and biospecimen
sample per patient case by creating barplots.
# Define parameters for function:
# upper y-axis limit and the data frame to plot (hyper-mutants removed)
ylim <- 260
tmb_df <- tmb_genomic_filter
# Run function
# Output file, built with file.path() like the other paths in this notebook
fname <- file.path(plots_dir, "Total-Mutations-patient_bs_id.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Total-Mutations-patient_bs_id.pdf"
# Build the per-sample barplot, then draw it into the PDF at `fname`
p <- create_barplot_sample(tmb_df = tmb_df, ylim = ylim)
grDevices::pdf(file = fname, width = 25, height = 10)
print(p)
# Close the PDF device so the file is written out
grDevices::dev.off()
quartz_off_screen
2
# Record the R version and package versions used for this run
utils::sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.4.1
Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggthemes_4.2.4 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2 purrr_1.0.1 readr_2.1.4
[8] tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] pillar_1.9.0 compiler_4.2.3 bslib_0.5.0 jquerylib_0.1.4 tools_4.2.3 bit_4.0.5
[7] digest_0.6.33 timechange_0.2.0 jsonlite_1.8.7 evaluate_0.21 lifecycle_1.0.3 gtable_0.3.3
[13] pkgconfig_2.0.3 rlang_1.1.1 cli_3.6.1 rstudioapi_0.15.0 parallel_4.2.3 yaml_2.3.7
[19] xfun_0.39 fastmap_1.1.1 withr_2.5.0 knitr_1.43 generics_0.1.3 vctrs_0.6.3
[25] sass_0.4.7 hms_1.1.3 bit64_4.0.5 rprojroot_2.0.3 tidyselect_1.2.0 glue_1.6.2
[31] R6_2.5.1 fansi_1.0.4 vroom_1.6.3 rmarkdown_2.23 farver_2.1.1 tzdb_0.4.0
[37] magrittr_2.0.3 scales_1.2.1 htmltools_0.5.5 colorspace_2.1-0 labeling_0.4.2 utf8_1.2.3
[43] stringi_1.7.12 munsell_0.5.0 cachem_1.0.8 crayon_1.5.2